home *** CD-ROM | disk | FTP | other *** search
Text File | 1992-12-19 | 33.5 KB | 1,205 lines |
- /*
- * rpcRecovery.c --
- *
- * The routines here maintain up/down state about other hosts.
- * Recovery actions that are registered via Rpc_HostNotify are
- * called-back when a host crashes and when it reboots.
- * Regular message traffic plus explicit pings are used to determine
- * the state of other hosts. The main external procedures are
- * Rpc_HostIsDown, used to query the state of another host, and
- * Rpc_WaitForHost, used to block a process until a host reboots.
- * (Rpc_WaitForHost isn't used much. Instead, modules rely on the
- * recovery callbacks to indicate that a host is back to life, and
- * they block processes in their own way.)
- *
- * One impact of these recovery hooks on the RPC system is that service
- * requests from a client that is just rebooting are blocked until
- * the recovery actions complete.
- *
- * Copyright 1987 Regents of the University of California
- * All rights reserved.
- */
-
- #ifndef lint
- /*
- * RCS identification string for this file; left out when running lint.
- */
- static char rcsid[] = "$Header: /cdrom/src/kernel/Cvsroot/kernel/rpc/rpcRecovery.c.old,v 9.0 89/09/12 15:18:27 douglis Stable $ SPRITE (Berkeley)";
- #endif /* not lint */
-
-
- #include "sprite.h"
- #include "rpc.h"
- #include "rpcInt.h"
- #include "sync.h"
- #include "hash.h"
- #include "mem.h"
- #include "trace.h"
-
- /*
- * The state of other hosts is kept in a hash table keyed on SpriteID.
- * This state is maintained by RpcHostAlive and RpcHostDead, which are
- * called in turn after packet reception or RPC timeout, respectively.
- * RpcHostDead is also called by the Rpc_Daemon if it can't get an
- * explicit acknowledgment from a client.
- */
- /* Backing storage for the host-state hash table. */
- static Hash_Table recoveryHashTableStruct;
- /* The routines in this file always reach the table through this pointer. */
- static Hash_Table *recovHashTable = &recoveryHashTableStruct;
-
- /*
- * Per-host record. A pointer to one of these is stored as the value of
- * the host's entry in recovHashTable.
- */
- typedef struct RecovHostState {
- int state; /* flags defined below */
- int clientState; /* flags defined in rpc.h */
- int spriteID; /* Sprite Host ID */
- int bootID; /* Timestamp from RPC header */
- Time time; /* Time of last message */
- Sync_Condition alive; /* Notified when host comes up */
- Sync_Condition recovery; /* Notified when recovery is complete */
- } RecovHostState;
-
- /*
- * Access to the hash table is monitored.
- */
- /* The monitor lock for this module. */
- Sync_Lock rpcRecoveryLock;
- /* Presumably the name LOCK_MONITOR/UNLOCK_MONITOR expand to -- TODO confirm in sync.h */
- #define LOCKPTR (&rpcRecoveryLock)
-
- /*
- * Host state:
- * RECOV_STATE_UNKNOWN Initial state.
- * RECOV_HOST_ALIVE Set when we receive a message from the host
- * RECOV_HOST_DEAD Set when an RPC times out.
- *
- * RECOV_CRASH_CALLBACKS Set during the crash call-backs, this is used
- * to block RPC server processes until the
- * crash recovery actions have completed.
- * RECOV_HOST_PINGING Set while there are pinging call-backs scheduled
- * RECOV_HOST_BOOTING Set while there are pinging call-backs scheduled
- *
- * RECOV_WAITING artificial state to trace Rpc_WaitForHost
- * RECOV_CRASH artificial state to trace RpcCrashCallBacks
- * RECOV_REBOOT artificial state to trace RpcRebootCallBacks
- */
- /* Basic up/down state of a host. */
- #define RECOV_STATE_UNKNOWN 0x0
- #define RECOV_HOST_ALIVE 0x1
- #define RECOV_HOST_DEAD 0x2
-
- /*
- * Modifier bits or'ed into the state word. RECOV_CRASH_CALLBACKS and
- * RECOV_HOST_PINGING are masked off before the up/down state is compared.
- */
- #define RECOV_CRASH_CALLBACKS 0x0100
- #define RECOV_HOST_PINGING 0x0200
- #define RECOV_HOST_BOOTING 0x0400
-
- /* Pseudo states that only appear in trace records. */
- #define RECOV_WAITING 0x4
- #define RECOV_CRASH 0x8
- #define RECOV_REBOOT 0x10
-
- /*
- * A host is "pinged" (to see when it reboots) at an interval determined by
- * rpcPingSeconds.
- */
- /* In seconds; CheckHost multiplies it by timer_IntOneSecond to reschedule itself. */
- int rpcPingSeconds = 30;
-
- /*
- * After a host reboots we pause a bit before attempting recovery. This
- * allows a host to complete boot-time start up. If we don't pause the
- * ping done by the recovery call backs may fail and we may erroneously
- * think that the other guy crashed right away.
- */
- /*
- * Used by RpcHostAlive as the delay before the reboot call-backs, and by
- * RpcRebootCallBacks as its retry interval.
- */
- int rpcRecoveryPause = 30; /* Seconds */
-
- /*
- * Other kernel modules can arrange call-backs when a host reboots.
- * The following list structure is used to keep these. The calling
- * sequence of the callback is as follows:
- * (*proc)(spriteID, clientData, when)
- * where 'when' is RPC_WHEN_HOST_DOWN or RPC_WHEN_HOST_REBOOTS (never both).
- */
-
- /*
- * One registered call-back. 'links' has to stay the first field because
- * the code casts between (NotifyElement *) and (List_Links *).
- */
- typedef struct {
- List_Links links;
- void (*proc)();
- int flags; /* RPC_WHEN_HOST_DOWN, RPC_WHEN_HOST_REBOOTS */
- ClientData clientData;
- } NotifyElement;
-
- /* List of NotifyElements; Rpc_HostNotify appends to it and the call-back routines scan it. */
- List_Links rpcNotifyList;
-
- /*
- * A trace is kept for debugging/understanding the host state transitions.
- */
- /* Client data of one trace entry; filled in by RECOV_TRACE. */
- typedef struct RpcRecovTraceRecord {
- int spriteID; /* Host ID whose state changed */
- int state; /* Their new state */
- } RpcRecovTraceRecord;
-
- /*
- * Tracing events, these describe the trace record. Note that some
- * trace types are defined in rpc.h for use with Rpc_HostTrace.
- *
- * RECOV_CUZ_WAIT Wait in Rpc_WaitForHost
- * RECOV_CUZ_WAKEUP Wakeup in Rpc_WaitForHost
- * RECOV_CUZ_INIT First time we were interested in the host
- * RECOV_CUZ_REBOOT We detected a reboot
- * RECOV_CUZ_CRASH We detected a crash
- * RECOV_CUZ_DONE Recovery actions completed
- * RECOV_CUZ_PING_CHK We are pinging the host to check it out
- * RECOV_CUZ_PING_ASK We are pinging the host because we were asked
- */
- /*
- * Passed as the 'event' argument of RECOV_TRACE and decoded again in
- * Rpc_PrintRecovTraceRecord.
- */
- #define RECOV_CUZ_WAIT 0x1
- #define RECOV_CUZ_WAKEUP 0x2
- #define RECOV_CUZ_INIT 0x4
- #define RECOV_CUZ_REBOOT 0x8
- #define RECOV_CUZ_CRASH 0x10
- #define RECOV_CUZ_DONE 0x20
- #define RECOV_CUZ_PING_CHK 0x40
- #define RECOV_CUZ_PING_ASK 0x80
-
- /* Trace buffer header, and the pointer handed to the Trace_ routines. */
- Trace_Header rpcRecovTraceHdr;
- Trace_Header *rpcRecovTraceHdrPtr = &rpcRecovTraceHdr;
- /* Length passed to Trace_Init; also the upper limit used by Rpc_PrintRecovTrace. */
- int rpcRecovTraceLength = 50;
- /* RECOV_TRACE records nothing while this is FALSE. */
- Boolean rpcRecovTracing = TRUE;
-
- #ifndef CLEAN
-
- #define RECOV_TRACE(zspriteID, zstate, event) \
- if (rpcRecovTracing) {\
- RpcRecovTraceRecord rec;\
- rec.spriteID = zspriteID;\
- rec.state = zstate;\
- Trace_Insert(rpcRecovTraceHdrPtr, event, &rec);\
- }
- #else
-
- #define RECOV_TRACE(zspriteID, zstate, event)
-
- #endif not CLEAN
- /*
- * Forward declarations.
- */
- void RpcRebootCallBacks();
- void RpcCrashCallBacks();
- /* Defined 'static' below, so the forward declaration has to agree. */
- static void MarkRecoveryComplete();
- int GetHostState();
- void StartPinging();
- /* Defined 'static' below, so the forward declaration has to agree. */
- static void CheckHost();
- void StopPinging();
- /* Called by Rpc_WaitForHost ahead of its definition. */
- Boolean RpcWaitForHostInt();
-
-
- /*
- *----------------------------------------------------------------------
- *
- * RpcInitRecovery --
- *
- * Set up the data structures used by the RpcRecovery module.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Initializes the host hash table, the notify list, and the recovery trace.
- *
- *----------------------------------------------------------------------
- */
-
- void
- RpcInitRecovery()
- {
- /* Host state table; HASH_ONE_WORD_KEYS because it is keyed on the Sprite host ID. */
- Hash_Init(recovHashTable, 8, HASH_ONE_WORD_KEYS);
- /* Starts out with no registered call-backs; see Rpc_HostNotify. */
- List_Init(&rpcNotifyList);
- /* Trace of rpcRecovTraceLength records, each carrying an RpcRecovTraceRecord. */
- Trace_Init(rpcRecovTraceHdrPtr, rpcRecovTraceLength,
- sizeof(RpcRecovTraceRecord), 0);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Rpc_HostNotify --
- *
- * Add a call-back for other modules to use when a host crashes/reboots.
- * The 'when' parameter specifies when to callback the client procedure.
- * If RPC_WHEN_HOST_DOWN then the procedure is called when the RPC
- * module has gotten a timeout trying to reach the host. If it is
- * RPC_WHEN_HOST_REBOOTS then the call-back is made when the RPC
- * module detects a reboot due to the bootID changing. If both
- * are specified then the call-back is made at both times.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Entry added to notify list.
- *
- *----------------------------------------------------------------------
- */
- void
- Rpc_HostNotify(proc, clientData, when)
-     void (*proc)();             /* Called as (*proc)(spriteID, clientData, when) */
-     ClientData clientData;      /* Handed back to proc unchanged */
-     int when;                   /* RPC_WHEN_HOST_DOWN, RPC_WHEN_HOST_REBOOTS */
- {
-     register NotifyElement *elemPtr;
-     register List_Links *linkPtr;
-
-     elemPtr = (NotifyElement *) Mem_Alloc(sizeof(NotifyElement));
-     linkPtr = (List_Links *) elemPtr;
-
-     /*
-      * Fill in the element, then append it so that call-backs are made
-      * in the order in which they were registered.
-      */
-     List_InitElement(linkPtr);
-     elemPtr->flags = when;
-     elemPtr->clientData = clientData;
-     elemPtr->proc = proc;
-     List_Insert(linkPtr, LIST_ATREAR(&rpcNotifyList));
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Rpc_HostIsDown --
- *
- * This decides if the specified host is down, and will make sure
- * that the host is being "pinged" if the caller wants to find
- * out (via the callbacks setup in Rpc_HostNotify) when the host
- * comes back to life. If the host is known to be down this routine
- * returns FAILURE and makes sure pinging is initiated (if needed).
- * Otherwise, if there hasn't been recent message traffic
- * (within the last 10 seconds) then this will ping the host to find
- * out if it's still up. If the ping fails there are two cases: the host
- * isn't up, or it is booting but its RPC service is not ready yet.
- * The status of the ping is returned as-is (see Results below), so a
- * booting host can be told apart from one that has crashed.
- *
- * Results:
- * SUCCESS if the host is up, FAILURE if it doesn't respond to
- * pings or is known to be down, and RPC_SERVICE_DISABLED if
- * the host says so.
- *
- * Side effects:
- * May do a ping. If the 'ping' parameter is TRUE this will make
- * sure that pinging is in progress if the host is down.
- *
- *----------------------------------------------------------------------
- */
-
- ReturnStatus
- Rpc_HostIsDown(spriteID, ping)
-     int spriteID;
-     Boolean ping;       /* If TRUE, we make sure the host is being pinged
-                          * if it is down now */
- {
-     register ReturnStatus result = SUCCESS;
-     register int hostState;
-
-     if (spriteID == NET_BROADCAST_HOSTID) {
-         Sys_Panic(SYS_WARNING, "Rpc_HostIsDown, got broadcast address\n");
-         return(SUCCESS);
-     }
-
-     hostState = GetHostState(spriteID);
-     if (hostState == RECOV_STATE_UNKNOWN) {
-         /*
-          * No recent evidence either way, so ask the host directly.
-          */
-         RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, RECOV_CUZ_PING_ASK);
-         result = Rpc_Ping(spriteID);
-     } else if (hostState == RECOV_HOST_DEAD) {
-         result = FAILURE;
-     }
-     /* RECOV_HOST_ALIVE (or anything else) leaves the result at SUCCESS. */
-
-     if (ping && result != SUCCESS) {
-         StartPinging(spriteID);
-     }
-     return(result);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Rpc_WaitForHost --
- *
- * Block the current process (at an interruptable priority) until
- * the given host comes back up. This is used when retrying
- * filesystem operations when a fileserver goes down, for example.
- *
- * Results:
- * TRUE if the wait was interrupted.
- *
- * Side effects:
- * The current process is blocked
- * until messages from the host indicate it is up.
- *
- *----------------------------------------------------------------------
- */
- Boolean
- Rpc_WaitForHost(spriteID)
-     int spriteID;       /* Host to monitor */
- {
-     /*
-      * Probe the host first; this also starts the pinging if it is down.
-      * Only a definite FAILURE sends us into the monitored wait routine,
-      * which re-checks the state so we never sleep on a live host.
-      */
-     if (Rpc_HostIsDown(spriteID, TRUE) != FAILURE) {
-         return(FALSE);
-     }
-     RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, RECOV_CUZ_WAIT);
-     return(RpcWaitForHostInt(spriteID));
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * RpcWaitForHostInt --
- *
- * Block the current process (at an interruptable priority) until
- * the given host comes back up. Our caller should have already
- * probed the host with Rpc_HostIsDown so that pinging is already
- * initiated.
- *
- * Results:
- * TRUE if the wait was interrupted by a signal.
- *
- * Side effects:
- * If the host is thought down, the current process is blocked
- * until messages from the host indicate it is up.
- *
- *----------------------------------------------------------------------
- */
- ENTRY Boolean
- RpcWaitForHostInt(spriteID)
-     int spriteID;       /* Host to monitor */
- {
-     Hash_Entry *hashPtr;
-     RecovHostState *hostPtr;
-     Boolean sigPending = FALSE;
-
-     LOCK_MONITOR;
-
-     if (spriteID <= 0 || spriteID == rpc_SpriteID) {
-         Sys_Panic(SYS_FATAL, "RpcWaitForHostInt, bad hostID %d\n", spriteID);
-         UNLOCK_MONITOR;
-         return(FALSE);
-     }
-
-     hashPtr = Hash_Find(recovHashTable, spriteID);
-     if (hashPtr->value == (Address)NIL) {
-         /*
-          * Every exit from this Boolean routine has to return a value;
-          * report "not interrupted", as for a bad host ID.
-          */
-         Sys_Panic(SYS_FATAL, "RpcWaitForHostInt, no host state\n");
-         UNLOCK_MONITOR;
-         return(FALSE);
-     }
-     hostPtr = (RecovHostState *)hashPtr->value;
-
-     /*
-      * Sleep until the host is no longer marked dead or a signal arrives.
-      * The state is re-tested after every wakeup.
-      */
-     while (!sigPending && (hostPtr->state & RECOV_HOST_DEAD)) {
-         sigPending = Sync_Wait(&hostPtr->alive, TRUE);
-     }
-     RECOV_TRACE(hostPtr->spriteID, hostPtr->state, RECOV_CUZ_WAKEUP);
-
-     UNLOCK_MONITOR;
-     return(sigPending);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Rpc_HostPrint --
- *
- * Print out a statement concerning a host. This maps to a
- * string hostname if possible, and prints out the message.
- * The strings come from netAddresses.c (but should be imported
- * via Net_RouteInstall system call).
- *
- * MOVE THIS TO NET
- *
- * Results:
- * None.
- *
- * Side effects:
- * Sys_Printf.
- *
- *----------------------------------------------------------------------
- */
-
- void
- Rpc_HostPrint(spriteID, string)
-     int spriteID;       /* Host the message is about */
-     char *string;       /* Text printed after the host identification */
- {
-     char *name;
-
-     Net_SpriteIDToName(spriteID, &name);
-     if (name != (char *)NIL) {
-         Sys_Printf("Sprite Host %s (%d) %s\n", name, spriteID, string);
-         return;
-     }
-     /* No name known for this ID, so print the number alone. */
-     Sys_Printf("Sprite Host <%d> %s\n", spriteID, string);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Rpc_HostTrace --
- *
- * Add an entry to the rpc recovery trace.
- *
- * Results:
- * None.
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- Rpc_HostTrace(spriteID, event)
- int spriteID;
- int event;
- {
- LOCK_MONITOR;
-
- /* The host's real state is not looked up; the record always says "unknown". */
- RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, event);
-
- UNLOCK_MONITOR;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Rpc_HostGetState --
- *
- * Return the client state associated with a host. The recovery host
- * table is a convenient object keyed on spriteID. Other modules can
- * set their own state in the table (beyond the simple up/down state
- * maintained by the rest of this module), and retrieve it with this call.
- *
- * Results:
- * A copy of the clientState field. 0 is returned if there is no
- * host table entry.
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY int
- Rpc_HostGetState(spriteID)
-     int spriteID;       /* Host whose client state is wanted */
- {
-     Hash_Entry *entryPtr;
-     RecovHostState *statePtr = (RecovHostState *)NIL;
-     int clientState = 0;
-
-     LOCK_MONITOR;
-
-     /*
-      * Look without creating an entry. A missing entry and an entry
-      * with no state attached both yield 0.
-      */
-     entryPtr = Hash_LookOnly(recovHashTable, spriteID);
-     if (entryPtr != (Hash_Entry *)NIL) {
-         statePtr = (RecovHostState *)entryPtr->value;
-     }
-     if (statePtr != (RecovHostState *)NIL) {
-         clientState = statePtr->clientState;
-     }
-
-     UNLOCK_MONITOR;
-     return(clientState);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Rpc_HostSetState --
- *
- * Set the client state associated with a host. This completely
- * overwrites the previous value of the client state.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Sets the clientState field of the host state. This will add an
- * entry to the host table if one doesn't already exist. Its RPC
- * up/down state is set to "unknown" in this case.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- Rpc_HostSetState(spriteID, state)
-     int spriteID;       /* Host whose client state is being set */
-     int state;          /* New client state; replaces the old value */
- {
-     Hash_Entry *entryPtr;
-     RecovHostState *statePtr;
-
-     LOCK_MONITOR;
-
-     entryPtr = Hash_Find(recovHashTable, spriteID);
-     if (entryPtr->value == (Address)NIL) {
-         /*
-          * First reference to this host: attach a zeroed record whose
-          * up/down state is "unknown".
-          */
-         statePtr = Mem_New(RecovHostState);
-         Byte_Zero(sizeof(RecovHostState), (Address)statePtr);
-         statePtr->spriteID = spriteID;
-         statePtr->state = RECOV_STATE_UNKNOWN;
-         entryPtr->value = (Address)statePtr;
-     } else {
-         statePtr = (RecovHostState *)entryPtr->value;
-     }
-     statePtr->clientState = state;
-
-     UNLOCK_MONITOR;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * RpcHostAlive --
- *
- * Mark the host as being alive. This is called when we've received
- * a message from the host. It uses state from the host table and
- * the bootID parameter to detect reboots. If a reboot is detected
- * but we thought the host was up then the Crash call-backs are invoked.
- * In any case, a reboot invokes the Reboot call-backs. (Call-backs
- * are installed with Rpc_HostNotify.) Finally, a time stamp is
- * kept so we can check when we last got a message from a host.
- *
- * This procedure is called from client RPC upon successful completion
- * of an RPC, and by server RPC upon receipt of a client request.
- * These two cases are identified by the 'asyncRecovery' parameter.
- * Servers want synchronous recovery so they don't service anything
- * until state associated with that client has been cleaned up via
- * the Crash call-backs. So RpcHostAlive blocks (if !asyncRecovery)
- * until the crash call-backs are complete. Clients don't have the
- * same worries so they let the crash call-backs complete in the
- * background (asyncRecovery is TRUE).
- *
- * Results:
- * None.
- *
- * Side effects:
- * Updates the boot timestamp of the other host. Procedures installed
- * with Rpc_HostNotify are called when the bootID changes. A timestamp
- * of when this message was received is obtained from the "cheap" clock
- * so we can tell later if there has been recent message traffic.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- RpcHostAlive(spriteID, bootID, asyncRecovery)
-     int spriteID;               /* Host ID of the message sender */
-     int bootID;                 /* Boot time stamp from message header */
-     Boolean asyncRecovery;      /* TRUE means do recovery call-backs in
-                                  * the background. FALSE causes the process
-                                  * to wait until crash recovery is complete. */
- {
-     register Hash_Entry *hashPtr;
-     register RecovHostState *hostPtr;
-     Boolean reboot = FALSE;     /* Used to control print statements at reboot */
-
-     LOCK_MONITOR;
-     if (spriteID == NET_BROADCAST_HOSTID || bootID == 0) {
-         /*
-          * Don't track the broadcast address. Also ignore zero valued
-          * bootIDs. These come from hosts at early boot time, or
-          * in certain error conditions like trying to send too much
-          * data in a single RPC.
-          */
-         UNLOCK_MONITOR;
-         return;
-     }
-
-     hashPtr = Hash_Find(recovHashTable, spriteID);
-     if (hashPtr->value == (Address)NIL) {
-         /*
-          * Initialize the host's state. This is the first time we've talked
-          * to it since we've been up, so take no action.
-          */
-         hostPtr = Mem_New(RecovHostState);
-         hashPtr->value = (Address)hostPtr;
-
-         Byte_Zero(sizeof(RecovHostState), (Address)hostPtr);
-         hostPtr->state = RECOV_HOST_ALIVE;
-         hostPtr->spriteID = spriteID;
-         hostPtr->bootID = bootID;
-
-         Rpc_HostPrint(spriteID, "is up");
-         RECOV_TRACE(spriteID, RECOV_HOST_ALIVE, RECOV_CUZ_INIT);
-     } else {
-         hostPtr = (RecovHostState *)hashPtr->value;
-     }
-     if (hostPtr != (RecovHostState *)NIL) {
-         /*
-          * Have to read the clock in order to suppress repeated pings,
-          * see GetHostState and Rpc_HostIsDown.
-          */
-         Timer_GetTimeOfDay(&hostPtr->time, (int *)NIL, (Boolean *)NIL);
-         /*
-          * Check for a rebooted peer by comparing boot time stamps.
-          * The first process to detect this initiates recovery actions.
-          */
-         if (hostPtr->bootID != bootID) {
-             Rpc_HostPrint(spriteID, "rebooted");
-             hostPtr->bootID = bootID;
-             reboot = TRUE;
-             RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_REBOOT);
-             if (hostPtr->state & RECOV_HOST_ALIVE) {
-                 /*
-                  * A crash occurred un-detected. We do the crash call-backs
-                  * first, and block server processes in the meantime.
-                  * RECOV_CRASH_CALLBACKS flag is reset by RpcCrashCallBacks.
-                  * The host is marked dead here so we fall into the
-                  * switch below and call the reboot callbacks.
-                  */
-                 RECOV_TRACE(spriteID, RECOV_CRASH, RECOV_CUZ_REBOOT);
-                 hostPtr->state &= ~RECOV_HOST_ALIVE;
-                 hostPtr->state |= (RECOV_HOST_DEAD | RECOV_CRASH_CALLBACKS);
-                 Proc_CallFunc(RpcCrashCallBacks, spriteID, 0);
-             }
-         }
-         /*
-          * Block servers until crash recovery actions complete.
-          * Servers are synchronous with respect to reboot recovery.
-          * This blocks requests from clients until after the
-          * recovery actions complete.
-          */
-         if (! asyncRecovery) {
-             while (hostPtr->state & RECOV_CRASH_CALLBACKS) {
-                 Sync_Wait(&hostPtr->recovery, FALSE);
-                 if (sys_ShuttingDown) {
-                     /*
-                      * NOTE(review): this exits without UNLOCK_MONITOR;
-                      * confirm that is acceptable during shutdown.
-                      */
-                     Sys_Printf("Warning, Server exiting from RpcHostAlive\n");
-                     Proc_Exit(1);
-                 }
-             }
-         }
-         /*
-          * Now that we've taken care of crash recovery, we see if the host
-          * is newly up. If so, invoke the reboot call-backs and then notify
-          * waiting processes. This means clientA (us) may start
-          * re-opening files from serverB (the other guy) at the same time
-          * as clientA (us) is closing files that serverB had had open.
-          * ie. both the crash and reboot call backs may proceed in parallel.
-          */
-         switch(hostPtr->state & ~(RECOV_CRASH_CALLBACKS|RECOV_HOST_PINGING)) {
-             case RECOV_HOST_ALIVE:
-                 /*
-                  * Host already alive.
-                  */
-                 break;
-             case RECOV_HOST_DEAD: {
-                 register int wait;
-                 /*
-                  * Notify interested parties that the host is up. If the host
-                  * has done a full reboot we wait a bit before pounding on
-                  * it with our re-open requests. This gives it a chance
-                  * to create RPC server processes, etc. so we don't think
-                  * it crashed because we tried to talk to it too soon.
-                  */
-                 if ( !reboot ) {
-                     Rpc_HostPrint(spriteID, "is back again");
-                     wait = 0;
-                 } else {
-                     wait = timer_IntOneSecond * rpcRecoveryPause;
-                 }
-                 hostPtr->state &= ~RECOV_HOST_DEAD;
-                 hostPtr->state |= RECOV_HOST_ALIVE;
-                 /* Wake up processes sleeping in RpcWaitForHostInt. */
-                 Sync_Broadcast(&hostPtr->alive);
-                 Proc_CallFunc(RpcRebootCallBacks, spriteID, wait);
-                 break;
-             }
-             default:
-                 Sys_Panic(SYS_WARNING, "Unexpected state <%x> for ",
-                         hostPtr->state);
-                 Rpc_HostPrint(spriteID, "");
-                 break;
-         }
-     }
-     UNLOCK_MONITOR;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * RpcHostDead --
- *
- * Change the host's state to "dead". This is called from client RPC
- * when an RPC timed out with no response. It is also called by the
- * Rpc_Daemon when it can't recontact a client to get an explicit
- * acknowledgment.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Sets the state in the host state table to dead. Pings are not
- * initiated here because we may or may not be interested in
- * the other host. See Rpc_HostIsDown.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- RpcHostDead(spriteID)
-     int spriteID;       /* Host that failed to respond */
- {
-     register Hash_Entry *entryPtr;
-     register RecovHostState *statePtr;
-     register int upDown;
-
-     LOCK_MONITOR;
-     /*
-      * The broadcast address is never tracked. When rpc_NoTimeouts is set
-      * the Rpc_Daemon may still call us if it can't get an acknowledgment
-      * from a host to close down a connection. We ignore this so that we
-      * don't take action against the offending host (who is probably in
-      * the debugger).
-      */
-     if (spriteID != NET_BROADCAST_HOSTID && !rpc_NoTimeouts) {
-         statePtr = (RecovHostState *)NIL;
-         entryPtr = Hash_LookOnly(recovHashTable, spriteID);
-         if (entryPtr != (Hash_Entry *)NIL) {
-             statePtr = (RecovHostState *)entryPtr->value;
-         }
-         if (statePtr != (RecovHostState *)NIL) {
-             upDown = statePtr->state &
-                     ~(RECOV_CRASH_CALLBACKS|RECOV_HOST_PINGING);
-             /*
-              * Only an unknown or alive host changes state; a host that
-              * is already dead needs nothing more.
-              */
-             if (upDown == RECOV_STATE_UNKNOWN || upDown == RECOV_HOST_ALIVE) {
-                 statePtr->state &= ~(RECOV_HOST_ALIVE|RECOV_STATE_UNKNOWN);
-                 statePtr->state |= RECOV_HOST_DEAD|RECOV_CRASH_CALLBACKS;
-                 Rpc_HostPrint(spriteID, "is down");
-                 RECOV_TRACE(spriteID, statePtr->state, RECOV_CUZ_CRASH);
-                 Proc_CallFunc(RpcCrashCallBacks, spriteID, 0);
-             }
-         }
-     }
-     UNLOCK_MONITOR;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * RpcRebootCallBacks --
- *
- * This calls the call-back procedures installed by other modules
- * via Rpc_HostNotify. It is invoked asynchronously from RpcHostAlive
- * when that procedure detects a reboot. It does an explicit ping
- * of the other host to make sure it is ready for our recovery actions.
- * This will reschedule itself for later if the host isn't ready.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Invoke the call-backs.
- *
- *----------------------------------------------------------------------
- */
-
- void
- RpcRebootCallBacks(data, callInfoPtr)
-     ClientData data;            /* Sprite ID of the rebooted host */
-     Proc_CallInfo *callInfoPtr; /* Its interval field controls rescheduling */
- {
-     register NotifyElement *elemPtr;
-     register int spriteID = (int)data;
-     ReturnStatus pingStatus;
-
-     pingStatus = Rpc_Ping(spriteID);
-     if (pingStatus == SUCCESS) {
-         /*
-          * The host is ready: run every call-back registered for reboots.
-          */
-         LIST_FORALL(&rpcNotifyList, (List_Links *)elemPtr) {
-             if (elemPtr->flags & RPC_WHEN_HOST_REBOOTS) {
-                 (*elemPtr->proc)(spriteID, elemPtr->clientData,
-                         RPC_WHEN_HOST_REBOOTS);
-             }
-         }
-         RECOV_TRACE(spriteID, RECOV_REBOOT, RECOV_CUZ_DONE);
-         callInfoPtr->interval = 0;      /* Don't call again */
-     } else if (pingStatus == RPC_SERVICE_DISABLED ||
-             pingStatus == RPC_TIMEOUT) {
-         /*
-          * Not ready yet; try again after another recovery pause.
-          */
-         if (pingStatus == RPC_SERVICE_DISABLED) {
-             Rpc_HostPrint(spriteID, "still booting");
-         } else {
-             Rpc_HostPrint(spriteID, "not responding");
-         }
-         callInfoPtr->interval = rpcRecoveryPause * timer_IntOneSecond;
-     }
-     /* Any other status leaves callInfoPtr->interval as it was passed in. */
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * RpcCrashCallBacks --
- *
- * Invoked asynchronously from RpcHostDead, and from RpcHostAlive when a
- * reboot reveals an undetected crash, so that other modules
- * can clean up behind the crashed host. When done the host
- * is marked as having recovery complete. This unblocks server
- * processes stalled in RpcHostAlive.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Invoke the call-backs with the RPC_WHEN_HOST_DOWN flag.
- * Clears the recovery in progress flag checked in RpcHostAlive.
- *
- *----------------------------------------------------------------------
- */
-
- void
- RpcCrashCallBacks(data, callInfoPtr)
- ClientData data;
- Proc_CallInfo *callInfoPtr;
- {
- register NotifyElement *notifyPtr;
- /* The Sprite host ID arrives as the client data of the call-back. */
- register int spriteID = (int)data;
-
- /* Run every call-back that was registered with RPC_WHEN_HOST_DOWN. */
- LIST_FORALL(&rpcNotifyList, (List_Links *)notifyPtr) {
- if (notifyPtr->flags & RPC_WHEN_HOST_DOWN) {
- (*notifyPtr->proc)(spriteID, notifyPtr->clientData,
- RPC_WHEN_HOST_DOWN);
- }
- }
- /* Clears RECOV_CRASH_CALLBACKS and wakes up waiters in RpcHostAlive. */
- MarkRecoveryComplete(spriteID);
- RECOV_TRACE(spriteID, RECOV_CRASH, RECOV_CUZ_DONE);
- callInfoPtr->interval = 0; /* Don't call again */
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * MarkRecoveryComplete --
- *
- * The recovery call-backs have completed, and this procedure's
- * job is to mark that fact in the host hash table and to notify
- * any processes that are blocked in RpcHostAlive waiting for this.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Sets the state, if any, in the host state table.
- * Notifies the hostPtr->recovery condition
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY static void
- MarkRecoveryComplete(spriteID)
-     int spriteID;       /* Host whose crash call-backs have finished */
- {
-     register Hash_Entry *hashPtr;
-     register RecovHostState *hostPtr;
-
-     LOCK_MONITOR;
-
-     /*
-      * Nothing to do if the host has no table entry or no state record.
-      */
-     hashPtr = Hash_LookOnly(recovHashTable, spriteID);
-     if (hashPtr != (Hash_Entry *)NIL) {
-         hostPtr = (RecovHostState *)hashPtr->value;
-         if (hostPtr != (RecovHostState *)NIL) {
-             /* Let go of the servers that are blocked in RpcHostAlive. */
-             hostPtr->state &= ~RECOV_CRASH_CALLBACKS;
-             Sync_Broadcast(&hostPtr->recovery);
-         }
-     }
-     UNLOCK_MONITOR;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * GetHostState --
- *
- * This looks into the host table to see and provides a guess
- * as to the host's current state. It uses a timestamp kept in
- * the host state to see if there's been recent message traffic.
- * If so, RECOV_HOST_ALIVE is returned. If not, RECOV_STATE_UNKNOWN
- * is returned and the caller should ping to make sure. Finally,
- * if it is known that the host is down already, then RECOV_HOST_DEAD
- * is returned.
- *
- * Results:
- * RECOV_STATE_UNKNOWN if the caller should ping to make sure.
- * RECOV_HOST_ALIVE if the host is up (recent message traffic).
- * RECOV_HOST_DEAD if the host is down (recent timeouts).
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY int
- GetHostState(spriteID)
-     int spriteID;       /* Host being asked about */
- {
-     register Hash_Entry *entryPtr;
-     register RecovHostState *statePtr = (RecovHostState *)NIL;
-     register int result = RECOV_STATE_UNKNOWN;
-     Time elapsed;
-
-     LOCK_MONITOR;
-
-     entryPtr = Hash_LookOnly(recovHashTable, spriteID);
-     if (entryPtr != (Hash_Entry *)NIL) {
-         statePtr = (RecovHostState *)entryPtr->value;
-     }
-     if (statePtr != (RecovHostState *)NIL) {
-         result = statePtr->state &
-                 ~(RECOV_CRASH_CALLBACKS|RECOV_HOST_PINGING);
-         if (result == RECOV_HOST_ALIVE) {
-             /*
-              * "Alive" only counts if we heard from the host within the
-              * last ten seconds. Otherwise the caller should ping.
-              */
-             Timer_GetTimeOfDay(&elapsed, (int *)NIL, (Boolean *)NIL);
-             Time_Subtract(elapsed, statePtr->time, &elapsed);
-             if (Time_GT(elapsed, time_TenSeconds)) {
-                 result = RECOV_STATE_UNKNOWN;
-             }
-         }
-     }
-
-     UNLOCK_MONITOR;
-     return(result);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * StartPinging --
- *
- * Make sure there is a background pinging process for the host.
- * The state bit used to indicate pinging is reset by StopPinging, which
- * CheckHost calls after it finally gets in a good ping.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Starts the pinging callback if not already in progress.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- StartPinging(spriteID)
-     int spriteID;       /* Host to ping in the background */
- {
-     register Hash_Entry *hashPtr;
-     register RecovHostState *hostPtr = (RecovHostState *)NIL;
-
-     LOCK_MONITOR;
-
-     /*
-      * Hash_LookOnly does not create entries, so a host that has never
-      * been heard from has nothing to look at. Rpc_HostIsDown can get
-      * here for such a host when its ping fails.
-      */
-     hashPtr = Hash_LookOnly(recovHashTable, spriteID);
-     if (hashPtr != (Hash_Entry *)NIL) {
-         hostPtr = (RecovHostState *)hashPtr->value;
-     }
-     if (hostPtr == (RecovHostState *)NIL) {
-         Sys_Panic(SYS_WARNING, "StartPinging, no host state for host %d\n",
-                 spriteID);
-     } else if ((hostPtr->state & RECOV_HOST_PINGING) == 0) {
-         /*
-          * The flag keeps us from scheduling a second CheckHost call-back
-          * for the same host.
-          */
-         hostPtr->state |= RECOV_HOST_PINGING;
-         Proc_CallFunc(CheckHost, spriteID, 0);
-     }
-     UNLOCK_MONITOR;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * CheckHost --
- *
- * This is the call back setup when a host is detected as crashed
- * and we want to find out when it comes back up. This pings
- * the remote host if it's down or there hasn't been recent traffic.
- * A side effect of a successful ping is a call to RpcHostAlive which
- * triggers the recovery actions.
- *
- * Results:
- * None.
- *
- * Side effects:
- * This pings the host unless there has been recent message
- * traffic. It reschedules itself if the ping fails.
- *
- *----------------------------------------------------------------------
- */
-
- static void
- CheckHost(data, callInfoPtr)
-     ClientData data;            /* Sprite ID of the host being checked */
-     Proc_CallInfo *callInfoPtr; /* Its interval field controls rescheduling */
- {
-     register int spriteID = (int)data;
-     register int hostState;
-     ReturnStatus pingStatus = SUCCESS;
-
-     hostState = GetHostState(spriteID);
-     if (hostState == RECOV_HOST_DEAD || hostState == RECOV_STATE_UNKNOWN) {
-         RECOV_TRACE(spriteID, hostState, RECOV_CUZ_PING_CHK);
-         pingStatus = Rpc_Ping(spriteID);
-     }
-     /* With recent traffic no ping is sent, which counts as success. */
-
-     if (pingStatus == SUCCESS) {
-         StopPinging(spriteID);
-         callInfoPtr->interval = 0;
-     } else {
-         /*
-          * Try again later if the host is still down.
-          */
-         callInfoPtr->interval = rpcPingSeconds * timer_IntOneSecond;
-     }
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * StopPinging --
- *
- * Note that the background pinging of a host has finished. Called by
- * CheckHost once the host answers a ping, or once recent message
- * traffic shows that it is up.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Clears the pinging bit in the host's state; warns if it was not set.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- StopPinging(spriteID)
-     int spriteID;       /* Host that is no longer being pinged */
- {
-     register Hash_Entry *hashPtr;
-     register RecovHostState *hostPtr = (RecovHostState *)NIL;
-
-     LOCK_MONITOR;
-
-     /*
-      * Hash_LookOnly can come back empty handed, so check before
-      * touching the host state.
-      */
-     hashPtr = Hash_LookOnly(recovHashTable, spriteID);
-     if (hashPtr != (Hash_Entry *)NIL) {
-         hostPtr = (RecovHostState *)hashPtr->value;
-     }
-     if (hostPtr == (RecovHostState *)NIL) {
-         Sys_Panic(SYS_WARNING, "StopPinging, no host state for host %d\n",
-                 spriteID);
-     } else {
-         if ((hostPtr->state & RECOV_HOST_PINGING) == 0) {
-             Sys_Panic(SYS_WARNING, "StopPinging found bad state\n");
-         }
-         hostPtr->state &= ~RECOV_HOST_PINGING;
-     }
-     UNLOCK_MONITOR;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Rpc_PrintRecovTraceRecord --
- *
- * Format and print the client data part of a recovery trace record.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Sys_Printf to the display.
- *
- *----------------------------------------------------------------------
- */
- int
- Rpc_PrintRecovTraceRecord(clientData, event, printHeaderFlag)
-     ClientData clientData;      /* Client data in the trace record */
-     int event;                  /* Type, or event, from the trace record */
-     Boolean printHeaderFlag;    /* If TRUE, a header line is printed */
- {
-     RpcRecovTraceRecord *recPtr = (RpcRecovTraceRecord *)clientData;
-     char *name;
-
-     if (printHeaderFlag) {
-         /*
-          * Print column headers and a newline.
-          */
-         Sys_Printf("%10s %10s %17s\n", "Host", "State", "Event ");
-     }
-     if (clientData != (ClientData)NIL) {
-         /*
-          * Host column: the name if there is one, else the numeric ID.
-          */
-         Net_SpriteIDToName(recPtr->spriteID, &name);
-         if (name == (char *)NIL) {
-             Sys_Printf("%10d ", recPtr->spriteID);
-         } else {
-             Sys_Printf("%10s ", name);
-         }
-         /*
-          * State column. Every case prints a trailing blank so the
-          * columns that follow line up.
-          */
-         switch(recPtr->state & ~(RECOV_CRASH_CALLBACKS|RECOV_HOST_PINGING)) {
-             case RECOV_STATE_UNKNOWN:
-                 Sys_Printf("%-8s ", "Unknown");
-                 break;
-             case RECOV_HOST_ALIVE:
-                 Sys_Printf("%-8s ", "Alive");
-                 break;
-             case RECOV_HOST_DEAD:
-                 Sys_Printf("%-8s ", "Dead");
-                 break;
-             case RECOV_WAITING:
-                 Sys_Printf("%-8s ", "Waiting");
-                 break;
-             case RECOV_CRASH:
-                 Sys_Printf("%-8s ", "Crash callbacks");
-                 break;
-             case RECOV_REBOOT:
-                 Sys_Printf("%-8s ", "Reboot callbacks");
-                 break;
-         }
-         /*
-          * Modifier columns: C for crash call-backs in progress, P for
-          * pinging in progress.
-          */
-         Sys_Printf("%3s", (recPtr->state & RECOV_CRASH_CALLBACKS) ?
-                 " C " : " ");
-         Sys_Printf("%3s", (recPtr->state & RECOV_HOST_PINGING) ?
-                 " P " : " ");
-         switch(event) {
-             case RECOV_CUZ_WAIT:
-                 Sys_Printf("waiting");
-                 break;
-             case RECOV_CUZ_WAKEUP:
-                 Sys_Printf("wakeup");
-                 break;
-             case RECOV_CUZ_INIT:
-                 Sys_Printf("init");
-                 break;
-             case RECOV_CUZ_REBOOT:
-                 Sys_Printf("reboot");
-                 break;
-             case RECOV_CUZ_CRASH:
-                 Sys_Printf("crash");
-                 break;
-             case RECOV_CUZ_DONE:
-                 Sys_Printf("done");
-                 break;
-             case RECOV_CUZ_PING_ASK:
-                 Sys_Printf("ping (ask)");
-                 break;
-             case RECOV_CUZ_PING_CHK:
-                 Sys_Printf("ping (check)");
-                 break;
-             case RPC_RECOV_TRACE_STALE:
-                 Sys_Printf("stale FS handle");
-                 break;
-             default:
-                 Sys_Printf("(%x)", event);
-                 break;
-         }
-         /* Our caller prints a newline */
-     }
-     /*
-      * The routine is declared int, so hand back a definite value
-      * instead of falling off the end.
-      */
-     return(0);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Rpc_PrintRecovTrace --
- *
- * Dump out the recovery trace. Called via a console L1 keystroke.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Prints to the console.
- *
- *----------------------------------------------------------------------
- */
-
- void
- Rpc_PrintRecovTrace(numRecs)
-     int numRecs;        /* Number of records wanted; out-of-range values
-                          * select the whole trace */
- {
-     int count = numRecs;
-
-     if (!(count > 0 && count <= rpcRecovTraceLength)) {
-         count = rpcRecovTraceLength;
-     }
-     Sys_Printf("RECOVERY TRACE\n");
-     Trace_Print(rpcRecovTraceHdrPtr, count, Rpc_PrintRecovTraceRecord);
- }
-
-
-